Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add parallel commands based on hostname #397

Merged
merged 2 commits into from
Dec 18, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
29 changes: 29 additions & 0 deletions doc/user_manual/src/Howtorun.tex
Original file line number Diff line number Diff line change
Expand Up @@ -25,3 +25,32 @@ \subsection{How to run}
\end{lstlisting}
Alternatively, you can use Python to run \texttt{HERON/src/main.py} with the HERON XML input as
argument; however, this will bypass loading the \texttt{raven\_libraries} and other initialization.

\subsection{Parallel Notes}

HERON uses RAVEN's parallel tools. Since running on different clusters
can require somewhat different commands, HERON allows the commands
used for parallel running to be chosen based on the hostname.

The parallel command templates are stored as XML files in the directory \texttt{templates/parallel}. For example:

\begin{lstlisting}[style=XML]
<parallel hostregexp="sawtooth[12].*">
<useParallel>
<mode>
mpi
<runQSUB />
</mode>
</useParallel>
<outer>
<parallelMethod>ray</parallelMethod>
</outer>
</parallel>
\end{lstlisting}

The \texttt{hostregexp} attribute is a regular expression that is matched
against the start of the hostname. The first template whose
\texttt{hostregexp} matches the hostname will be used as the template for
running in parallel. If parallel is used, then the contents of the
\texttt{useParallel} section will be added to the RunInfo in the RAVEN
input. If the batch size is greater than one, then the contents of the
\texttt{outer} section will be added to the RunInfo as well.
11 changes: 11 additions & 0 deletions templates/parallel/bitterroot.xml
Original file line number Diff line number Diff line change
@@ -0,0 +1,11 @@
<parallel hostregexp="bitterroot[12].*">
<useParallel>
<mode>slurm
<runSbatch />
</mode>
</useParallel>
<outer>
<parallelMethod>dask</parallelMethod>
</outer>
</parallel>

11 changes: 11 additions & 0 deletions templates/parallel/sawtooth.xml
Original file line number Diff line number Diff line change
@@ -0,0 +1,11 @@
<parallel hostregexp="sawtooth[12].*">
<useParallel>
<mode>
mpi
<runQSUB />
</mode>
</useParallel>
<outer>
<parallelMethod>ray</parallelMethod>
</outer>
</parallel>
71 changes: 59 additions & 12 deletions templates/template_driver.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,9 @@
import shutil
import xml.etree.ElementTree as ET
import itertools as it
import socket
import glob
import re

import numpy as np
import dill as pk
Expand Down Expand Up @@ -237,6 +240,23 @@ def _modify_outer_mode(self, template, case, components, sources):
elif case.get_mode() == 'opt':
template.find('Samplers').remove(template.find(".//Grid[@name='grid']"))

def _get_parallel_xml(self, hostname):
  """
    Finds the xml file to go with the given hostname.
    Every "*.xml" file in the "parallel" directory next to this module is parsed and its
    root 'hostregexp' attribute is matched against the hostname. Files are checked in sorted
    filename order so that, when several patterns match, the same template is chosen every run.
    @ In, hostname, string with the hostname to search for
    @ Out, xml, xml.etree.ElementTree.Element or None, root of the first template whose
      hostregexp matches the hostname, otherwise None
  """
  # Should this allow loading from another directory (such as one
  #  next to the input file?)
  path = os.path.join(os.path.dirname(__file__), "parallel", "*.xml")
  # glob.glob gives no ordering guarantee; sort so "first match" is well defined
  for filename in sorted(glob.glob(path)):
    cur_xml = ET.parse(filename).getroot()
    regexp = cur_xml.attrib.get('hostregexp')
    if regexp is None:
      # same exception type as a plain attrib lookup, but it names the offending file
      raise KeyError(f"parallel template '{filename}' is missing the 'hostregexp' attribute")
    # re.match only anchors at the beginning of the hostname
    if re.match(regexp, hostname):
      return cur_xml
  return None

def _modify_outer_runinfo(self, template, case):
"""
Defines modifications to the RunInfo of outer.xml RAVEN input file.
Expand All @@ -257,15 +277,26 @@ def _modify_outer_runinfo(self, template, case):
elif case.get_mode() == 'opt':
run_info.find('Sequence').text = 'optimize, plot'
# parallel
# Should there be a way to override the hostname (such as if we are
# generating the files to run on a different computer?)
hostname = socket.gethostbyaddr(socket.gethostname())[0]
self.parallel_xml = self._get_parallel_xml(hostname)
#note, parallel_xml might be None
if case.outerParallel:
# set outer batchsize and InternalParallel
batchSize = run_info.find('batchSize')
batchSize.text = f'{case.outerParallel}'
run_info.append(xmlUtils.newNode('internalParallel', text='True'))
self._modify_outer_parallel(template, case)
if case.useParallel:
#XXX this doesn't handle non-mpi modes like torque or other custom ones
mode = xmlUtils.newNode('mode', text='mpi')
mode.append(xmlUtils.newNode('runQSUB'))
if self.parallel_xml is None:
#this doesn't handle non-mpi modes like torque or other custom ones
# so it is highly recommended that a parallel xml template be created
# for hosts that are using those.
mode = xmlUtils.newNode('mode', text='mpi')
mode.append(xmlUtils.newNode('runQSUB'))
else:
for child in self.parallel_xml.find('useParallel'):
if child.tag == 'mode':
mode = child
else:
run_info.append(child)
if 'memory' in case.parallelRunInfo:
mode.append(xmlUtils.newNode('memory', text=case.parallelRunInfo.pop('memory')))
for sub in case.parallelRunInfo:
Expand All @@ -274,6 +305,26 @@ def _modify_outer_runinfo(self, template, case):
if case.innerParallel:
run_info.append(xmlUtils.newNode('NumMPI', text=case.innerParallel))

def _modify_outer_parallel(self, template, case):
  """
    Modifies the outer parallel settings in RunInfo: sets the batch size to
    case.outerParallel and adds the nodes that switch on outer parallel running.
    This should only be called if case.outerParallel > 0, and after
    self.parallel_xml has been set (it may be None).
    If self.parallel_xml has an 'outer' section, the children of that section are
    appended to RunInfo; if there is no host template, or the template has no
    'outer' section, the default internalParallel node is appended instead.
    @ In, template, xml.etree.ElementTree.Element, root of XML to modify
    @ In, case, HERON Case, defining Case instance
    @ Out, None
  """
  run_info = template.find('RunInfo')
  # set outer batchsize and InternalParallel
  batchSize = run_info.find('batchSize')
  batchSize.text = f'{case.outerParallel}'
  # find() returns None when the template has no 'outer' section; treat that
  #  the same as having no template rather than failing while iterating None
  outer = None if self.parallel_xml is None else self.parallel_xml.find('outer')
  if outer is None:
    run_info.append(xmlUtils.newNode('internalParallel', text='True'))
  else:
    # append all the children in the 'outer' element
    run_info.extend(list(outer))

def _modify_outer_vargroups(self, template, case, components, sources):
"""
Defines modifications to the VariableGroups of outer.xml RAVEN input file.
Expand Down Expand Up @@ -703,12 +754,8 @@ def _modify_outer_samplers(self, template, case, components):
#XXX if we had a way to calculate this ahead of time,
# this could be done in _modify_outer_runinfo
#Need to update the outerParallel number
run_info = template.find('RunInfo')
case.outerParallel = len(self.__sweep_vars) + 1
#XXX duplicate of code in _modify_outer_runinfo
batchSize = run_info.find('batchSize')
batchSize.text = f'{case.outerParallel}'
run_info.append(xmlUtils.newNode('internalParallel', text='True'))
self._modify_outer_parallel(template, case)

def _modify_outer_optimizers(self, template, case):
"""
Expand Down
Loading