Skip to content

Commit

Permalink
Merge pull request #397 from joshua-cogliati-inl/hostname_parallel
Browse files Browse the repository at this point in the history
Add parallel commands based on hostname
  • Loading branch information
dylanjm authored Dec 18, 2024
2 parents 190ca29 + ddf561c commit 3855918
Show file tree
Hide file tree
Showing 4 changed files with 110 additions and 12 deletions.
29 changes: 29 additions & 0 deletions doc/user_manual/src/Howtorun.tex
Original file line number Diff line number Diff line change
Expand Up @@ -25,3 +25,32 @@ \subsection{How to run}
\end{lstlisting}
Alternatively, you can use Python to run \texttt{HERON/src/main.py} with the HERON XML input as
argument; however, this will bypass loading the \texttt{raven\_libraries} and other initialization.

\subsection{Parallel Notes}

HERON uses RAVEN's parallel tools. Since running on different clusters
can require somewhat different commands, HERON allows the commands
used for parallel running to be chosen based on the hostname.

These commands are stored as XML template files in the directory \texttt{templates/parallel}. Example:

\begin{lstlisting}[style=XML]
<parallel hostregexp="sawtooth[12].*">
<useParallel>
<mode>
mpi
<runQSUB />
</mode>
</useParallel>
<outer>
<parallelMethod>ray</parallelMethod>
</outer>
</parallel>
\end{lstlisting}

The \texttt{hostregexp} attribute is a regular expression that is
matched against the beginning of the hostname; the first template whose
regular expression matches will be used as the template for running in
parallel. If parallel is used, then the contents of the
\texttt{useParallel} section will be added to the RunInfo in the RAVEN
input. If the batch size is greater than one, then the contents of the
\texttt{outer} section will be added to the RunInfo as well. If no
template matches the hostname, HERON falls back to its built-in defaults.
11 changes: 11 additions & 0 deletions templates/parallel/bitterroot.xml
Original file line number Diff line number Diff line change
@@ -0,0 +1,11 @@
<!-- Parallel template: selected when the hostname matches the hostregexp attribute below.
     The children of useParallel supply the RunInfo settings when parallel is used;
     the children of outer are appended to the RunInfo for outer parallel runs. -->
<parallel hostregexp="bitterroot[12].*">
<useParallel>
<mode>slurm
<runSbatch />
</mode>
</useParallel>
<outer>
<parallelMethod>dask</parallelMethod>
</outer>
</parallel>

11 changes: 11 additions & 0 deletions templates/parallel/sawtooth.xml
Original file line number Diff line number Diff line change
@@ -0,0 +1,11 @@
<!-- Parallel template: selected when the hostname matches the hostregexp attribute below.
     The children of useParallel supply the RunInfo settings when parallel is used;
     the children of outer are appended to the RunInfo for outer parallel runs. -->
<parallel hostregexp="sawtooth[12].*">
<useParallel>
<mode>
mpi
<runQSUB />
</mode>
</useParallel>
<outer>
<parallelMethod>ray</parallelMethod>
</outer>
</parallel>
71 changes: 59 additions & 12 deletions templates/template_driver.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,9 @@
import shutil
import xml.etree.ElementTree as ET
import itertools as it
import socket
import glob
import re

import numpy as np
import dill as pk
Expand Down Expand Up @@ -237,6 +240,23 @@ def _modify_outer_mode(self, template, case, components, sources):
elif case.get_mode() == 'opt':
template.find('Samplers').remove(template.find(".//Grid[@name='grid']"))

def _get_parallel_xml(self, hostname):
  """
    Finds the parallel template xml file that goes with the given hostname.
    Every "*.xml" file in the "parallel" directory next to this module is
    parsed, in sorted filename order, and the root of the first one whose
    "hostregexp" attribute matches the start of the hostname is returned.
    A template whose root has no "hostregexp" attribute raises a KeyError.
    @ In, hostname, str, the hostname to search for
    @ Out, xml, xml.etree.ElementTree.Element or None, root of the first matching template, None if nothing matches
  """
  # Should this allow loading from another directory (such as one
  # next to the input file?)
  pattern = os.path.join(os.path.dirname(__file__), "parallel", "*.xml")
  # glob returns its results in arbitrary order; sort them so that the
  # "first match wins" rule is reproducible when several templates match.
  for filename in sorted(glob.glob(pattern)):
    root = ET.parse(filename).getroot()
    # re.match anchors at the beginning of the hostname only
    if re.match(root.attrib['hostregexp'], hostname):
      return root
  return None

def _modify_outer_runinfo(self, template, case):
"""
Defines modifications to the RunInfo of outer.xml RAVEN input file.
Expand All @@ -257,15 +277,26 @@ def _modify_outer_runinfo(self, template, case):
elif case.get_mode() == 'opt':
run_info.find('Sequence').text = 'optimize, plot'
# parallel
# Should there be a way to override the hostname (such as if we are
# generating the files to run on a different computer?)
hostname = socket.gethostbyaddr(socket.gethostname())[0]
self.parallel_xml = self._get_parallel_xml(hostname)
#note, parallel_xml might be None
if case.outerParallel:
# set outer batchsize and InternalParallel
batchSize = run_info.find('batchSize')
batchSize.text = f'{case.outerParallel}'
run_info.append(xmlUtils.newNode('internalParallel', text='True'))
self._modify_outer_parallel(template, case)
if case.useParallel:
#XXX this doesn't handle non-mpi modes like torque or other custom ones
mode = xmlUtils.newNode('mode', text='mpi')
mode.append(xmlUtils.newNode('runQSUB'))
if self.parallel_xml is None:
#this doesn't handle non-mpi modes like torque or other custom ones
# so it is highly recommended that a parallel xml template be created
# for hosts that are using those.
mode = xmlUtils.newNode('mode', text='mpi')
mode.append(xmlUtils.newNode('runQSUB'))
else:
for child in self.parallel_xml.find('useParallel'):
if child.tag == 'mode':
mode = child
else:
run_info.append(child)
if 'memory' in case.parallelRunInfo:
mode.append(xmlUtils.newNode('memory', text=case.parallelRunInfo.pop('memory')))
for sub in case.parallelRunInfo:
Expand All @@ -274,6 +305,26 @@ def _modify_outer_runinfo(self, template, case):
if case.innerParallel:
run_info.append(xmlUtils.newNode('NumMPI', text=case.innerParallel))

def _modify_outer_parallel(self, template, case):
  """
    Modifies the outer parallel settings in the RunInfo. This should only be
    called if case.outerParallel > 0.
    Sets the batchSize text to case.outerParallel, then appends either the
    children of the host template's "outer" section or, when there is no host
    template (or the template has no "outer" section), the default
    internalParallel node with text "True".
    @ In, template, xml.etree.ElementTree.Element, root of XML to modify
    @ In, case, HERON Case, defining Case instance
    @ Out, None
  """
  run_info = template.find('RunInfo')
  # set outer batchsize and InternalParallel
  run_info.find('batchSize').text = f'{case.outerParallel}'
  # find() returns None for a template without an 'outer' section; compare
  # with "is None" because an Element without children is falsy.
  outer = None if self.parallel_xml is None else self.parallel_xml.find('outer')
  if outer is None:
    # no host-specific instructions available, use the generic default
    run_info.append(xmlUtils.newNode('internalParallel', text='True'))
  else:
    # append all the children in the 'outer' element
    run_info.extend(list(outer))

def _modify_outer_vargroups(self, template, case, components, sources):
"""
Defines modifications to the VariableGroups of outer.xml RAVEN input file.
Expand Down Expand Up @@ -703,12 +754,8 @@ def _modify_outer_samplers(self, template, case, components):
#XXX if we had a way to calculate this ahead of time,
# this could be done in _modify_outer_runinfo
#Need to update the outerParallel number
run_info = template.find('RunInfo')
case.outerParallel = len(self.__sweep_vars) + 1
#XXX duplicate of code in _modify_outer_runinfo
batchSize = run_info.find('batchSize')
batchSize.text = f'{case.outerParallel}'
run_info.append(xmlUtils.newNode('internalParallel', text='True'))
self._modify_outer_parallel(template, case)

def _modify_outer_optimizers(self, template, case):
"""
Expand Down

0 comments on commit 3855918

Please sign in to comment.